This is the work for machine failure prediction. We predict the state of the machine one hour ahead, based on what we learn from the preceding data.

The first step is to install the required packages and to load them if you do not already have them.
#################################### Preparing the data set ################################################
# Installation of packages and loading them ###########################
#### Install the required packages (only needed once per machine)

# install.packages("corrplot")
# install.packages("caret")
# install.packages("randomForest")
# install.packages("MASS")
# install.packages("rpart")
# install.packages("e1071")
# install.packages("glmnet")
# install.packages("plotly")        # typo fixed: was install.pacakges
# install.packages("missMDA")
# install.packages("pROC")
# install.packages("DMwR")
# install.packages("gbm")
# install.packages("rattle")
# install.packages("rpart.plot")
# install.packages("RColorBrewer")
# install.packages("party")
# install.packages("partykit")
# install.packages("doMC")          # needed by registerDoMC() in the modelling section

#### Load the necessary packages (duplicate library(rpart) / library(caret) calls removed)
library(pROC)          # ROC curves and AUC
library(MASS)
library(rpart)         # decision trees
library(randomForest)
library(e1071)
library(glmnet)
library(plotly)
library(ggplot2)
library(missMDA)       # imputePCA for missing-value imputation
library(caret)         # train(), trainControl(), varImp()
library(DMwR)          # SMOTE resampling
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(party)
library(partykit)
library(doMC)          # parallel backend used via registerDoMC()

Then we set the working directory and load the data set, resampled at a one-hour interval.

## Working directory and data import.
## NOTE(review): absolute path is machine-specific — adjust (or switch to a
## project-relative path) before running elsewhere.
setwd("/home/moustapha/Energiency Big Data Project/Archive")
data <- read.table("all1h1.csv", header = TRUE, sep = ",")
# data <- read.table("all1h2.csv", header = TRUE, sep = ",")

# Parse the timestamp column. POSIXct (rather than POSIXlt) is the safe
# date-time class to store inside a data frame.
data$X <- as.POSIXct(data$X, format = "%Y-%m-%d %H:%M:%S")
colnames(data)[1] <- "date"
rownames(data) <- data$date

First View of the Data

##       date                         prodh            elec      
##  Min.   :2012-12-31 23:00:00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:2013-10-20 21:45:00   1st Qu.:28.04   1st Qu.:13.61  
##  Median :2014-08-09 20:30:00   Median :32.97   Median :14.23  
##  Mean   :2014-08-09 20:57:44   Mean   :27.11   Mean   :12.20  
##  3rd Qu.:2015-05-29 19:15:00   3rd Qu.:35.03   3rd Qu.:14.53  
##  Max.   :2016-03-17 18:00:00   Max.   :38.49   Max.   :15.69  
##                                NA's   :1594    NA's   :1284   
##       gram           planstop          prod      
##  Min.   :     0   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:    42   1st Qu.:1.000   1st Qu.:27.89  
##  Median :    45   Median :1.000   Median :32.48  
##  Mean   : 43640   Mean   :0.802   Mean   :26.54  
##  3rd Qu.:    45   3rd Qu.:1.000   3rd Qu.:34.61  
##  Max.   :420010   Max.   :1.000   Max.   :38.22  
##  NA's   :21747    NA's   :17623   NA's   :22957

Reshaping the gram variable so it can be added to the plot

##       date                         prodh            elec      
##  Min.   :2012-12-31 23:00:00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:2013-10-20 21:45:00   1st Qu.:28.04   1st Qu.:13.61  
##  Median :2014-08-09 20:30:00   Median :32.97   Median :14.23  
##  Mean   :2014-08-09 20:57:44   Mean   :27.11   Mean   :12.20  
##  3rd Qu.:2015-05-29 19:15:00   3rd Qu.:35.03   3rd Qu.:14.53  
##  Max.   :2016-03-17 18:00:00   Max.   :38.49   Max.   :15.69  
##                                NA's   :1594    NA's   :1284   
##       gram          planstop          prod      
##  Min.   : 0.00   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:40.00   1st Qu.:1.000   1st Qu.:27.89  
##  Median :42.00   Median :1.000   Median :32.48  
##  Mean   :38.05   Mean   :0.802   Mean   :26.54  
##  3rd Qu.:45.00   3rd Qu.:1.000   3rd Qu.:34.61  
##  Max.   :48.80   Max.   :1.000   Max.   :38.22  
##  NA's   :21747   NA's   :17623   NA's   :22957

Then we select the subset of the data containing only the complete planstop records

##       date                         prodh            elec        
##  Min.   :2015-01-05 06:00:00   Min.   : 0.00   Min.   : 0.0027  
##  1st Qu.:2015-04-24 15:00:00   1st Qu.: 7.44   1st Qu.:12.8713  
##  Median :2015-08-12 00:00:00   Median :32.42   Median :13.7709  
##  Mean   :2015-08-12 00:31:11   Mean   :24.32   Mean   :11.0050  
##  3rd Qu.:2015-11-29 09:00:00   3rd Qu.:34.55   3rd Qu.:14.2445  
##  Max.   :2016-03-17 18:00:00   Max.   :38.34   Max.   :15.3352  
##                                NA's   :1384    NA's   :1259     
##       gram          planstop           prod      
##  Min.   : 0.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:40.00   1st Qu.:1.0000   1st Qu.:27.89  
##  Median :42.00   Median :1.0000   Median :32.48  
##  Mean   :38.05   Mean   :0.8023   Mean   :26.54  
##  3rd Qu.:45.00   3rd Qu.:1.0000   3rd Qu.:34.61  
##  Max.   :48.80   Max.   :1.0000   Max.   :38.22  
##  NA's   :4124                     NA's   :5334

According to the summary and the plot, we will keep the variables with the fewest missing values — prodh and elec — and restrict the data to observations up to “2016-01-26 07:00:00”.

# Restrict the data to observations up to the chosen cut-off timestamp.
cutoff <- which(nomissing$date == "2016-01-26 07:00:00")
data1 <- nomissing[seq_len(cutoff), ]
summary(data1)
##       date                         prodh            elec          
##  Min.   :2015-01-05 06:00:00   Min.   : 0.00   Min.   : 0.002683  
##  1st Qu.:2015-04-11 18:15:00   1st Qu.: 7.44   1st Qu.:12.871267  
##  Median :2015-07-17 06:30:00   Median :32.42   Median :13.770942  
##  Mean   :2015-07-17 06:57:21   Mean   :24.32   Mean   :11.005025  
##  3rd Qu.:2015-10-21 18:45:00   3rd Qu.:34.55   3rd Qu.:14.244475  
##  Max.   :2016-01-26 07:00:00   Max.   :38.34   Max.   :15.335150  
##                                NA's   :149     NA's   :24         
##       gram          planstop           prod      
##  Min.   : 0.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:22.50   1st Qu.:1.0000   1st Qu.:27.89  
##  Median :42.00   Median :1.0000   Median :32.48  
##  Mean   :37.53   Mean   :0.7849   Mean   :26.54  
##  3rd Qu.:45.00   3rd Qu.:1.0000   3rd Qu.:34.61  
##  Max.   :48.80   Max.   :1.0000   Max.   :38.22  
##  NA's   :3463                     NA's   :4099

we replace the missing values in prodh by the value in prod

##       date                         prodh             elec          
##  Min.   :2015-01-05 06:00:00   Min.   : 0.000   Min.   : 0.002683  
##  1st Qu.:2015-04-11 18:15:00   1st Qu.: 9.827   1st Qu.:12.871267  
##  Median :2015-07-17 06:30:00   Median :32.421   Median :13.770942  
##  Mean   :2015-07-17 06:57:21   Mean   :24.397   Mean   :11.005025  
##  3rd Qu.:2015-10-21 18:45:00   3rd Qu.:34.552   3rd Qu.:14.244475  
##  Max.   :2016-01-26 07:00:00   Max.   :38.337   Max.   :15.335150  
##                                NA's   :48       NA's   :24         
##       gram          planstop           prod      
##  Min.   : 0.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:22.50   1st Qu.:1.0000   1st Qu.:27.89  
##  Median :42.00   Median :1.0000   Median :32.48  
##  Mean   :37.53   Mean   :0.7849   Mean   :26.54  
##  3rd Qu.:45.00   3rd Qu.:1.0000   3rd Qu.:34.61  
##  Max.   :48.80   Max.   :1.0000   Max.   :38.22  
##  NA's   :3463                     NA's   :4099

Let us check the proportion of missing values in our data set

##           
##            FALSE TRUE
##   date      9266    0
##   prodh     9218   48
##   elec      9242   24
##   gram      5803 3463
##   planstop  9266    0
##   prod      5167 4099

There are still some missing values left in prodh (48) and elec (24); we impute them with the imputePCA method from the missMDA package.

## $ncp
## [1] 1
## 
## $criterion
##        0        1        2 
## 78.14242 11.80203 13.30586
##      prodh            elec              planstop     
##  Min.   : 0.00   Min.   : 0.002683   Min.   :0.0000  
##  1st Qu.:10.75   1st Qu.:12.880504   1st Qu.:1.0000  
##  Median :32.41   Median :13.768533   Median :1.0000  
##  Mean   :24.42   Mean   :11.006553   Mean   :0.7849  
##  3rd Qu.:34.54   3rd Qu.:14.243908   3rd Qu.:1.0000  
##  Max.   :38.34   Max.   :15.335150   Max.   :1.0000

Plot of the final data set without missing values

Second step: creation of the variable failure, which will be our target

##       date                         prodh            elec          
##  Min.   :2015-01-05 06:00:00   Min.   : 0.00   Min.   : 0.002683  
##  1st Qu.:2015-04-11 18:15:00   1st Qu.:10.75   1st Qu.:12.880504  
##  Median :2015-07-17 06:30:00   Median :32.41   Median :13.768533  
##  Mean   :2015-07-17 06:57:21   Mean   :24.42   Mean   :11.006553  
##  3rd Qu.:2015-10-21 18:45:00   3rd Qu.:34.54   3rd Qu.:14.243908  
##  Max.   :2016-01-26 07:00:00   Max.   :38.34   Max.   :15.335150  
##     planstop           State     
##  Min.   :0.0000   Faillure: 480  
##  1st Qu.:1.0000   Normal  :8786  
##  Median :1.0000                  
##  Mean   :0.7849                  
##  3rd Qu.:1.0000                  
##  Max.   :1.0000
## 
##  Faillure    Normal 
##  5.180229 94.819771
## 
##  Faillure       fix    Normal 
##  1.802288  3.377941 94.819771
## 
##  Faillure    Normal 
##  1.802288 98.197712

For the moment we have only five variables in our data set; let us create additional ones from it, which we will use for our models.

Let us create the variable worktime, which counts the consecutive working time of the machine

## worktime[i] = number of consecutive hours the machine has been in the
## "Normal" state up to and including hour i; the counter resets to 0 on
## every failure row.
worktime <- numeric(nrow(data2))  # preallocated — no element-wise growing
run <- 0
for (i in seq_len(nrow(data2))) {
  if (data2$State[i] == "Normal") {
    run <- run + 1
  } else {
    run <- 0
  }
  worktime[i] <- run
}

data2$worktime <- worktime

Creation of varible, for consider the production and the electricity variation

Then we create a function to apply transformations to our variables (log(var+1), sqrt, power2, power3, poly)

we also make a Decomposition of the date in months and weekdays and adding it like variables

We write a function that creates new variables from time lags

## a function for new variable from time decalage

## Build `number` lagged copies of column `var`: lag i shifts the column
## down by i rows, padding the first i entries with NA. Returns `data`
## with the new columns appended (default names; callers rename them).
varcreation <- function(number, data, var) {
  n <- nrow(data)
  # Bug fix: look the column up in the `data` argument — the original
  # read colnames(d1), silently depending on the global `d1`.
  k <- which(colnames(data) == var)
  for (i in seq_len(number)) {
    lagged <- c(rep(NA, i), data[, k][-c((n - i + 1):n)])
    data[length(data) + 1] <- lagged
  }
  data
}

# Work on a copy so data2 is left untouched while lag features are added.
d1 <- data2

## Append `numb` lagged copies of column `var` to the global data set `d1`
## and name them "<var>..1" ... "<var>..numb".
## NOTE(review): reads the global `d1`; callers reassign the returned value.
decal <- function(var, numb = 24) {
  n <- length(d1)
  # Bug fix: the original re-set `numb = 24` inside the body, silently
  # ignoring the caller's argument; the parameter is now honoured.
  d1 <- varcreation(numb, d1, var)
  colnames(d1)[(n + 1):(n + numb)] <- paste(rep(var, numb), seq_len(numb), sep = "..")
  d1
}

## Apply decal() to every column except the date, appending 24 lagged
## copies of each to d1.
tode <- colnames(d1)[-1]
for (v in tode) {
  d1 <- decal(v, numb = 24)
}

## Factor handling: the lagged copies of the factor variables lost their
## factor class during lag construction. Convert each lagged column back
## to a factor and re-attach the level labels of its source column.
## (Globals numb and namm1..namm5 are kept for any downstream use.)
numb <- 24
namm1 <- paste(rep("State", numb), seq_len(numb), sep = "..")
namm2 <- paste(rep("ffail", numb), seq_len(numb), sep = "..")
namm3 <- paste(rep("ffail2", numb), seq_len(numb), sep = "..")
namm4 <- paste(rep("months", numb), seq_len(numb), sep = "..")
namm5 <- paste(rep("wday", numb), seq_len(numb), sep = "..")
lag_names <- list(State = namm1, ffail = namm2, ffail2 = namm3,
                  months = namm4, wday = namm5)
for (src in names(lag_names)) {
  # The source column is never modified here, so its levels are
  # loop-invariant and hoisted out of the inner loop.
  src_levels <- levels(d1[[src]])
  for (nm in lag_names[[src]]) {
    j <- which(colnames(d1) == nm)
    d1[, j] <- as.factor(d1[, j])
    # NOTE(review): positional level assignment assumes every lagged
    # column exhibits all source levels in the same sorted order; a lag
    # window missing a level would be silently mislabelled — verify.
    levels(d1[, j]) <- src_levels
  }
}

creation of the working data set

## Build the working data set: drop the date/identifier columns (1:3) and
## the unlagged feature block (8:28), then keep complete rows only.
df <- d1[, -c(1:3, 8:28)]
df <- na.omit(df)

## Split targets from predictors: df columns 2-4 hold the three target
## encodings, everything else is a predictor.
y1 <- df[, 2]
y3 <- df[, 3]
y2 <- df[, 4]
x <- df[, -c(2, 3, 4)]

# Reassemble df with the targets first, as later code expects.
df <- data.frame(y1, y2, y3, x)

# Numeric copy of the predictors for the models that need one
# (factor columns are coerced to their integer codes).
x <- as.data.frame(lapply(x, as.numeric))
rownames(x) <- rownames(df)

Some interesting plot

## Faillure   Normal 
##      480     8761

## Faillure   Normal 
##      167     9074

## Faillure      fix   Normal 
##      167      313     8761

Making the test and training splits

## ytrain1
##   Faillure     Normal 
## 0.05194002 0.94805998
## ytest1
##   Faillure     Normal 
## 0.05194805 0.94805195
## ytrain1T
## Faillure   Normal 
## 0.050875 0.949125
## ytest1T
##   Faillure     Normal 
## 0.05882353 0.94117647
## ytrain2
##   Faillure     Normal 
## 0.01855001 0.98144999
## ytest2
##   Faillure     Normal 
## 0.01695527 0.98304473
## ytrain2T
## Faillure   Normal 
## 0.017625 0.982375
## ytest2T
##   Faillure     Normal 
## 0.02095085 0.97904915

Creation of super-resampled data

new Proportion of Faillure in randomly case

## 
##  Faillure    Normal 
## 0.3076923 0.6923077

new Proportion of Faillure in time slice case

## 
##  Faillure    Normal 
## 0.3076923 0.6923077

After data preparation, let us set up the environment

Application of treebag model

### Tree bag model

## 2-fold cross-validation with class probabilities, shared by every
## model fit below.
ctrl <- trainControl(method = "cv", number = 2, classProbs = TRUE)

## Fit a bagged-tree classifier on predictors `xd` and target `yd`.
## NOTE(review): registerDoMC() comes from the doMC package, which is not
## loaded in the setup section — confirm it is attached before running.
tbmodel <- function(xd, yd, ctrl) {
  registerDoMC(cores = 5)  # parallel backend for caret::train
  train(x = xd, y = yd, method = "treebag", trControl = ctrl)
}


########## Case 1: random train/test split, machine-learning approach ####


#### Predict only the first failure (targets ytrain2 / ytest2)
# Aliases so the fit/evaluate code below is target-agnostic.
ytr=ytrain2
ytest=ytest2
xtr=xtrain
xte=xtest
# Fit the bagged-tree model (2-fold CV, class probabilities).
model=tbmodel(xtr,ytr,ctrl)
# Pick the probability threshold that optimises performance on the
# training fit.
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.32
# Predict on the held-out test set.


predProba<-predict(model, xte, type = "prob")
# predProba[1] is presumably the "Faillure" probability column — verify.
# NOTE(review): a hard-coded 0.05 threshold is used here instead of the
# computed optimalthr (0.32) — possibly deliberate to favour sensitivity
# on the rare class, but confirm; the time-slice case uses optimalthr.
pred = as.factor(ifelse(predProba[1] > 0.05,"Faillure","Normal"))
# Sensitivity (recall on failures) and specificity on the test set.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       24    257
##   Normal         23   2468
mean(pred==ytest)
## [1] 0.8989899
# NOTE(review): the ROC is built from the hard class labels, not from
# predProba — consider roc(ytest, predProba[[1]]) for a proper curve.
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 47 controls (as.numeric(ytest) 1) < 2725 cases (as.numeric(ytest) 2).
## Area under the curve: 0.7082
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.5106383
specificity
## [1] 0.9056881
## Variable importance of the fitted model (top 20 shown).
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

#### SMOTE-resampled data: prediction of the first failure
# Train on the SMOTE-balanced set; evaluate on the untouched test set.
ytr=dfSmote2$ytrain2
ytest=ytest2
xtr=dfSmote2[,-1]
xte=xtest
# Fit the bagged-tree model on the resampled data.
model=tbmodel(xtr,ytr,ctrl)
# Pick the probability threshold on the training fit.
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.36
# Predict on the held-out test set.

predProba<-predict(model, xte, type = "prob")
# NOTE(review): hard-coded 0.2 threshold instead of optimalthr (0.36) —
# presumably chosen to boost sensitivity; confirm intent.
pred = as.factor(ifelse(predProba[1] > 0.2,"Faillure","Normal"))
# Sensitivity (recall on failures) and specificity on the test set.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       35    729
##   Normal         12   1996
mean(pred==ytest)
## [1] 0.732684
# NOTE(review): ROC built from hard labels, not probabilities — see
# the note in the first section.
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 47 controls (as.numeric(ytest) 1) < 2725 cases (as.numeric(ytest) 2).
## Area under the curve: 0.7386
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.7446809
specificity
## [1] 0.7324771
## Variable importance of the fitted model (top 20 shown).
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

############ Second approach: time-slice train/test split ####

#### First-failure prediction on the time-slice split
# NOTE(review): the targets used are ytrain2T/ytest2T (first failure),
# although the original header said "all the faillures" — confirm which
# target was intended.
ytr=ytrain2T
ytest=ytest2T
xtr=xtrainT
xte=xtestT
# Fit the bagged-tree model on the time-slice training data.
model=tbmodel(xtr,ytr,ctrl)
# Pick the probability threshold on the training fit.
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.32
# Predict on the held-out test set, this time using the computed
# optimal threshold (unlike the random-split sections above).

predProba<-predict(model, xte, type = "prob")
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
# Sensitivity (recall on failures) and specificity on the test set.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure        5      3
##   Normal         21   1212
mean(pred==ytest)
## [1] 0.9806608
# NOTE(review): ROC built from hard labels, not probabilities — see
# the note in the first section.
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 26 controls (as.numeric(ytest) 1) < 1215 cases (as.numeric(ytest) 2).
## Area under the curve: 0.5949
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.1923077
specificity
## [1] 0.9975309
## Variable importance of the fitted model (top 20 shown).
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

###### Time-slice split with SMOTE-resampled training data
#### Prediction of the first failure, SMOTE approach
ytr=dfSmote2T$ytrain2T
ytest=ytest2T
xtr=dfSmote2T[,-1]
xte=xtestT
# Fit the bagged-tree model on the resampled time-slice training data.
model=tbmodel(xtr,ytr,ctrl)
# Pick the probability threshold on the training fit.
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.4
# Predict on the held-out test set.

predProba<-predict(model, xte, type = "prob")
# NOTE(review): hard-coded 0.2 threshold instead of optimalthr (0.4) —
# presumably chosen to boost sensitivity; confirm intent.
pred = as.factor(ifelse(predProba[1] > 0.2,"Faillure","Normal"))
# Sensitivity (recall on failures) and specificity on the test set.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       14    347
##   Normal         12    868
mean(pred==ytest)
## [1] 0.7107172
# NOTE(review): ROC built from hard labels, not probabilities — see
# the note in the first section.
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 26 controls (as.numeric(ytest) 1) < 1215 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6264
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.5384615
specificity
## [1] 0.7144033
## Variable importance of the fitted model (top 20 shown).
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)